/* Matthew C Mahutga, Michaela Curran, and Anthony Roberts
   matthew.mahutga@ucr.edu
   Job Tasks and the Comparative Structure of Income and Employment: Routine Task Intensity and Offshorability for the LIS
   International Journal of Comparative Sociology 2018
   
   Description: This script builds the RTI/OFFS dataset from the Microsoft Excel file that contains all of the LIS occupational category recodes (occ1_c).
   Please adjust file names to your file paths where noted with [adjust file path]
   Date Modified: February 5, 2018 */

* Start from an empty session and stop Stata from pausing long output
set more off
clear all
label drop _all

/* Step 1: Point Stata at the folder that holds all of the input files */

cd "C:\Users\Michaela\Google Drive\NSF\Validation\IJCS\RR\Accepted Docs\Items for Website\On Website" // [adjust file path]

/* Step 2: Read the "File Manifest" sheet of the recodes workbook and collect the distinct dataset names found in its dname column */
import excel using "lis_occ_recodes.xlsx", sheet("File Manifest") firstrow case(lower)

* The local macro dname is what Step 3 loops over; the manifest itself is then discarded
levelsof dname, local(dname)
clear

/* Step 3: Read every worksheet named in the local macro dname from lis_occ_recodes.xlsx, tag its rows with the
   sheet name, and stack all sheets into one dataset held in a temporary file */

* Pass 1: import each sheet (all columns as strings) and park it in its own temporary file
foreach sheet of local dname{
	import excel using "lis_occ_recodes.xlsx", sheet("`sheet'") firstrow case(lower) allstring clear
	generate dname="`sheet'"
	order dname, first
	tempfile temp`sheet'
	save `temp`sheet''
}

* Pass 2: start from an empty dataset and append the parked sheets in the same order they were read
clear
foreach sheet of local dname{
	append using `temp`sheet''
}

* Keep the stacked result so it can be merged onto the manifest in the next step
tempfile lis_recodes_complete
save `lis_recodes_complete'

/* Step 4: Attach the manifest columns (country name, survey year, classification scheme name, level, and
   country-year replication flag) to every recode row */
* The manifest has one row per dname and the stacked recodes have many, hence 1:m;
* nogenerate suppresses the _merge bookkeeping variable
import excel using "lis_occ_recodes.xlsx", sheet("File Manifest") firstrow case(lower) clear
merge 1:m dname using `lis_recodes_complete', nogenerate

/* Step 5: Clean-up, label, and notate variables and dataset */
* Dataset-level note. The citation year was previously left as the placeholder "(ADD DATE)";
* it now matches the 2018 publication cited everywhere else in this script.
notes: This dataset contains Routine Task Intensity (RTI) and Offshorability (OFFS) scores for occupations for 178 LIS country-years. It generates these scores through a series of weighted ISCO-88 recodes. Please see Mahutga, Curran, and Roberts (2018) for details.

* ak is dropped here before any labelling; NOTE(review): it looks like a stray unnamed spreadsheet column - confirm against the workbook
drop ak
order class_name class_level replication_dname, last

* Identifier and manifest variables
label variable dname "country/year identifier"
* Lower-case the identifier; the country-specific fixes in Step 13 compare against lower-case codes
replace dname=lower(dname)
notes dname: LIS country code + last two digits of year
label variable country "Country Name"
label variable year "Year"

* Original LIS occupation code and its labels
label variable occ1_c "occupation, job 1"
label variable lis_label "Occupation label, LIS codebook"
order text_label eng_label, after(lis_label)
label variable text_label "Cleaned Occupation Label, LIS codebook"
notes text_label: In some instances, the LIS codebook had unusual characters that had to be cleaned
label variable eng_label "Translated (English) Occupation Label, if applicable"
notes eng_label: In some instances, the LIS codebook was in the country's official language. In these cases, we translated them to English using a combination of our own knowledge and translation software (Google Translate and Babylon)

* The sheets were imported as all-string, so convert the code and weight columns to numeric
* before any numeric comparison is made on them
destring occ1_c isco_88_code* isco_88_percode*, replace

* Each LIS occupation can map to up to 11 ISCO-88 recodes, each with a code, a label, and a percent weight
forvalues i=1/11{
	label variable isco_88_code`i' "ISCO-88 Occupation Code, Recode `i'"
	label variable isco_88_label`i' "ISCO-88 Occupation Label, Recode `i'"
	label variable isco_88_percode`i' "ISCO-88 Occupational Percent Wgt, Recode `i'"
}

* Classification-scheme metadata carried over from the manifest
label variable class_name "Original Occupation Classification Scheme in LIS"
label variable class_level "Digit level of Original Occupation Classification Scheme in LIS"
label variable replication_dname "Country-Year was utilized in paper analysis"
note class_level: Digit level refers to the level of detail of the scheme. For example, two digit refers to second level headings, three digit to third, etc. Higher numbers mean more detailed categories.

/* Step 6: Turn the "UNKNOWN" placeholder in the first recode slot into missing values: code 9999, label "UNKNOWN",
   and the accompanying weight of 100 */
replace isco_88_code1=. if isco_88_code1==9999
replace isco_88_label1="" if isco_88_label1=="UNKNOWN"
* The weight is blanked only on rows where both the code and the label are now empty
replace isco_88_percode1=. if isco_88_percode1==100 & isco_88_code1==. & isco_88_label1==""
save `lis_recodes_complete', replace

/* Step 7: Convert every percent weight (0-100) into a proportion (0-1) ahead of the weighting procedure */
forvalues slot=1/11{
	replace isco_88_percode`slot'=isco_88_percode`slot'/100
}

/* Step 8: For each of the 11 recode slots, look up the RTI and OFFS score of its ISCO-88 code and multiply the
   score by the slot's weight */

* RTI lookup table written as "code score" pairs, one pair per two-digit ISCO-88 code
local rti_map ///
	11 -.570638   12 -.6505817  13 -1.445288 ///
	21 -.7274764  22 -.9099581  23 -1.465365  24 -.6357164 ///
	31 -.2923957  32 -.226255   33 -1.368632  34 -.3385954 ///
	41 2.410058   42 1.555783 ///
	51 -.4976501  52 .1693505 ///
	61 .1440091   62 .4728294 ///
	71 -.0752973  72 .5826445   73 1.742398   74 1.382539 ///
	81 .445601    82 .6191809   83 -1.417865 ///
	91 .1426892   92 .3803685   93 .5742638
local rti_tokens : word count `rti_map'

forvalues slot=1/11{
	* Codes that are not in the lookup table leave the score missing
	gen rti`slot'=.
	forvalues pos=1(2)`rti_tokens'{
		local next=`pos'+1
		local code : word `pos' of `rti_map'
		local score : word `next' of `rti_map'
		replace rti`slot'=`score' if isco_88_code`slot'==`code'
	}

	* Weight the score by the proportion assigned to this recode
	gen weightedrti`slot'=rti`slot'*isco_88_percode`slot'

	label variable rti`slot' "RTI Score for ISCO-88 Recode `slot'"
	label variable weightedrti`slot' "Weighted RTI Score for ISCO-88 Recode `slot'"

	* Keep the new columns next to the weight they were derived from
	order rti`slot' weightedrti`slot', after(isco_88_percode`slot')
}
	
* OFFS lookup table written as "code score" pairs, one pair per two-digit ISCO-88 code
local offs_map ///
	11 -0.4182   12 -0.13088  13 -0.4588 ///
	21 1.308184  22 -0.58951  23 -0.70319  24 0.426383 ///
	31 0.075538  32 -0.58453  33 -0.64814  34 0.313463 ///
	41 0.622611  42 -0.05574 ///
	51 -0.77659  52 -0.73008 ///
	61 -0.84081  62 -0.84081 ///
	71 -0.77321  72 -0.26801  73 1.942179  74 1.406988 ///
	81 1.87082   82 2.659787  83 -0.84081 ///
	91 -0.6415   92 -0.84081  93 -0.48433
local offs_tokens : word count `offs_map'

forvalues slot=1/11{
	* Codes that are not in the lookup table leave the score missing
	gen offs`slot'=.
	forvalues pos=1(2)`offs_tokens'{
		local next=`pos'+1
		local code : word `pos' of `offs_map'
		local score : word `next' of `offs_map'
		replace offs`slot'=`score' if isco_88_code`slot'==`code'
	}

	* Weight the score by the proportion assigned to this recode
	gen weightedoffs`slot'=offs`slot'*isco_88_percode`slot'

	label variable offs`slot' "OFFS Score for ISCO-88 Recode `slot'"
	label variable weightedoffs`slot' "Weighted OFFS Score for ISCO-88 Recode `slot'"

	* Place the OFFS columns directly after the slot's weighted RTI column
	order offs`slot' weightedoffs`slot', after(weightedrti`slot')
}

/* Step 9: Add up the weighted RTI and OFFS scores across recode slots to obtain the composite scores. See
   Mahutga, Curran, and Roberts (2018) for information about the weighting procedure */
* The missing option leaves the total missing when every weighted score in the row is missing
egen rti_score=rowtotal(weightedrti*), missing
egen offs_score=rowtotal(weightedoffs*), missing
label variable rti_score "Total Weighted RTI Score"
label variable offs_score "Total Weighted OFFS Score"

/* Step 10: Flag rows whose scores come from a weighting procedure (i.e., isco_88_percode1 is a non-missing value
   below 100%), then move the summary variables */
* The logical expression evaluates to 1 when true and 0 otherwise, so rows with a missing weight are coded 0
gen rti_weighted=(isco_88_percode1<1 & !missing(isco_88_percode1))
label variable rti_weighted "Indicator Variable for Weighted RTI Score"

* The OFFS flag uses exactly the same rule as the RTI flag
gen offs_weighted=(isco_88_percode1<1 & !missing(isco_88_percode1))
label variable offs_weighted "Indicator Variable for Weighted OFFS Score"

order rti_score offs_score rti_weighted offs_weighted, before(class_name)

/* Step 11: Generate recoded version of LIS occa1 (three category) variable */
* A row receives a category when one of its ISCO-88 recodes carries more than half of the weight:
*   1 = ISCO-88 major groups 1-2 (codes 11-24)
*   2 = ISCO-88 major groups 3-8 (codes 31-83)
*   3 = ISCO-88 major group 9   (codes 91-93)
gen occa1_r=.

forvalues i=1/11{
	* Stata treats missing as larger than any number, so "percode>.5" alone is also true for a missing weight.
	* Require a non-missing weight, matching the guard used when isco_88_r and replication_flag are built.
	local dominant "isco_88_percode`i'>.5 & !missing(isco_88_percode`i')"

	replace occa1_r=1 if inlist(isco_88_code`i',11,12,13,21,22,23,24) & (`dominant')
	replace occa1_r=2 if inlist(isco_88_code`i',31,32,33,34,41,42,51,52,61,62,71,72,73,74,81,82,83) & (`dominant')
	replace occa1_r=3 if inlist(isco_88_code`i',91,92,93) & (`dominant')
}

label variable occa1_r "LIS occa1 recode based on ISCO-88 recodes"
* Weights are proportions at this point, so the .5 cutoff is 50 percent (the note used to say ".5 percent")
note occa1_r: LIS occa1 (three category) assigns a code based on whether or not one of the ISCO-88 recodes is over 50 percent. The recode receives the ISCO code for the highest percentage category (over 50 percent)

/* Step 12: Generate recoded version of LIS occb1 (ten category) variable */
* A row receives the ISCO-88 major group (first digit of the two-digit code) of whichever recode carries
* more than half of the weight
gen occb1_r=.

forvalues i=1/11{
	* Stata treats missing as larger than any number, so "percode>.5" alone is also true for a missing weight.
	* Require a non-missing weight, matching the guard used when isco_88_r and replication_flag are built.
	local dominant "isco_88_percode`i'>.5 & !missing(isco_88_percode`i')"

	replace occb1_r=1 if inlist(isco_88_code`i',11,12,13) & (`dominant')
	replace occb1_r=2 if inlist(isco_88_code`i',21,22,23,24) & (`dominant')
	replace occb1_r=3 if inlist(isco_88_code`i',31,32,33,34) & (`dominant')
	replace occb1_r=4 if inlist(isco_88_code`i',41,42) & (`dominant')
	replace occb1_r=5 if inlist(isco_88_code`i',51,52) & (`dominant')
	replace occb1_r=6 if inlist(isco_88_code`i',61,62) & (`dominant')
	replace occb1_r=7 if inlist(isco_88_code`i',71,72,73,74) & (`dominant')
	replace occb1_r=8 if inlist(isco_88_code`i',81,82,83) & (`dominant')
	replace occb1_r=9 if inlist(isco_88_code`i',91,92,93) & (`dominant')
}

label variable occb1_r "LIS occb1 recode based on ISCO-88 recodes"
note occb1_r: This LIS occb1 (ten category) recode assigns a code based on whether or not one of the ISCO-88 recodes is over 50 percent. The recode receives the ISCO code for the highest percentage category (over 50 percent)

/* Step 13: Fix some peculiarities in certain LIS ISCO-88 datasets for occa1 and occb1 recodes */
* Fixes for occa1: specific occ1_c values in specific country-years are assigned their three-category code directly.
* Every rule below targets a different (dataset, occ1_c) combination, so the rules do not interact.

* Belgium (every dname containing "be")
replace occa1_r=1 if strpos(dname, "be")>0 & inlist(occ1_c, 1000, 2000)
replace occa1_r=2 if strpos(dname, "be")>0 & inlist(occ1_c, 3000, 4000, 5000, 7000, 8000)
replace occa1_r=3 if strpos(dname, "be")>0 & occ1_c==9000

* Denmark 1992, 2004, and 2007
replace occa1_r=1 if inlist(dname, "dk92", "dk04", "dk07") & inlist(occ1_c, 1000, 2000)
replace occa1_r=2 if inlist(dname, "dk92", "dk04", "dk07") & inlist(occ1_c, 3000, 4000, 5000, 6000, 7000, 8000)
replace occa1_r=3 if inlist(dname, "dk92", "dk04", "dk07") & occ1_c==9000

* Estonia 2000
replace occa1_r=1 if dname=="ee00" & occ1_c==2000

* Hungary 2005
replace occa1_r=2 if dname=="hu05" & inlist(occ1_c, 4000, 6000, 7000)

* Slovakia 1992 (two-digit occ1_c values in this dataset)
replace occa1_r=2 if dname=="sk92" & inlist(occ1_c, 30, 60, 70)
replace occa1_r=3 if dname=="sk92" & occ1_c==90

* Slovenia (every dname containing "si")
replace occa1_r=1 if strpos(dname, "si")>0 & occ1_c==1000
replace occa1_r=2 if strpos(dname, "si")>0 & inlist(occ1_c, 4000, 7000)
	
* Fixes for occb1: the same country-years as the occa1 fixes, but the assigned category is the leading digit
* of occ1_c, obtained by dividing the listed round codes by 1000 (or by 10 for Slovakia 1992)

* Belgium (every dname containing "be")
replace occb1_r=occ1_c/1000 if strpos(dname, "be")>0 & inlist(occ1_c, 1000, 2000, 3000, 4000, 5000, 7000, 8000, 9000)

* Denmark 1992, 2004, and 2007
replace occb1_r=occ1_c/1000 if inlist(dname, "dk92", "dk04", "dk07") & inlist(occ1_c, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000)

* Estonia 2000
replace occb1_r=2 if dname=="ee00" & occ1_c==2000

* Hungary 2005
replace occb1_r=occ1_c/1000 if dname=="hu05" & inlist(occ1_c, 4000, 6000, 7000)

* Slovakia 1992 (two-digit occ1_c values in this dataset)
replace occb1_r=occ1_c/10 if dname=="sk92" & inlist(occ1_c, 30, 60, 70, 90)

* Slovenia (every dname containing "si")
replace occb1_r=occ1_c/1000 if strpos(dname, "si")>0 & inlist(occ1_c, 1000, 4000, 7000)

/* Step 14: Build the single ISCO-88 recode used to replicate the validation analyses */
* Take the code of any recode slot whose weight is non-missing and above one half
gen isco_88_r=.
forvalues slot=1/11{
	replace isco_88_r=isco_88_code`slot' if isco_88_percode`slot'!=. & isco_88_percode`slot'>.5
}
* 9999 is the "UNKNOWN" placeholder code, so it is treated as missing
recode isco_88_r (9999=.)

label variable isco_88_r "Occupation recode to an ISCO-88 category over 50%"
note isco_88_r: This variable is used in the validation analyses in Mahutga, Curran, and Roberts (2018). It assigns an ISCO-88 occupation category to occ1_c on the basis of a 50% rule. If an ISCO-88 recode is weighted over 50%, this variable is assigned that category.

/* Step 15: Flag each occupation recode that was part of the paper replication */
* Default to 0; switch to 1 when the country-year was used in the paper and some recode slot has a
* non-missing weight above one half
gen replication_flag=0
forvalues slot=1/11{
	replace replication_flag=1 if replication_dname==1 & isco_88_percode`slot'!=. & isco_88_percode`slot'>.5
}
label variable replication_flag "Occupation Recodes and Scores were utilized in Mahutga, Curran, and Roberts (2018)"

/* Step 16: Save the datasets. Optional outputs are commented out; uncomment the ones you need */
* Full dataset with all variables (Stata format)
* save "lis_recodes_complete.dta", replace // [adjust file path if you want to save dataset to a different folder]

* Full dataset with all variables (Excel format)
* export excel using "lis_recodes_complete.xlsx", firstrow(variables) replace // [adjust file path if you want to save dataset to a different folder]

* Abbreviated dataset: only the variables needed to merge onto LIS data
order isco_88_r occa1_r occb1_r, after(occ1_c)
local merge_vars dname occ1_c isco_88_r occa1_r occb1_r rti_score rti_weighted offs_score offs_weighted replication_flag
keep `merge_vars'
save "lis_rti_offs.dta", replace // [adjust file path if you want to save dataset to a different folder]

* Abbreviated dataset (Excel format)
* export excel using "lis_rti_offs.xlsx", firstrow(variables) replace // [adjust file path if you want to save dataset to a different folder]

exit
